# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install(c("GEOquery", "edgeR", "limma", "ggplot2", "reshape2"))
library(GEOquery)
library(edgeR)
library(limma)
library(ggplot2)
library(reshape2)
library(ggrepel)
library(biomaRt)
library(DT)
library(pheatmap)
# # Download supplementary files for GSE203206 [https://molecularbrain.biomedcentral.com/articles/10.1186/s13041-022-00963-2#MOESM2] if the directory doesn't exist.
# if (!dir.exists("GSE203206")) {
# getGEOSuppFiles("GSE203206", makeDirectory = TRUE, baseDir = ".")
# }
#
# url <- "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE203nnn/GSE203206/suppl/GSE203206_Subramaniam.ADRC_brain.counts.tsv.gz"
# destfile <- "./GSE203206/GSE203206_Subramaniam.ADRC_brain.counts.tsv.gz"
# dir.create("./GSE203206", showWarnings = FALSE)
# download.file(url, destfile, mode = "wb")
# Download the GEO dataset
getGEOSuppFiles("GSE203206", makeDirectory = TRUE)counts <- read.table("./GSE203206/GSE203206_Subramaniam.ADRC_brain.counts.tsv.gz",
header = TRUE, sep = "\t", row.names = 1)
metadata <- read.table("./GSE203206/GSE203206_Subramaniam.ADRC_brain.metadata.tsv.gz",
header = TRUE, sep = "\t", row.names = 1)
# Define ALZHEIMER_TYPE based on 'Sample' prefix and filter only Early and Control
metadata$ALZHEIMER_TYPE <- NA
metadata$ALZHEIMER_TYPE[grepl("^EOL", metadata$Sample)] <- "Early"
metadata$ALZHEIMER_TYPE[grepl("^COL", metadata$Sample)] <- "Control"
metadata$ALZHEIMER_TYPE[grepl("^LOL", metadata$Sample)] <- "Late"
metadata <- metadata[!is.na(metadata$ALZHEIMER_TYPE), ]
metadata$ALZHEIMER_TYPE <- factor(metadata$ALZHEIMER_TYPE, levels = c("Control", "Early", "Late"))## Counts matrix (first few rows):
## E_4920_OL_RNA_S1 E_5273_OL_RNA_S75 E_5618_OL_RNA_S62
## ENSG00000000003 51.68859 98.80348 47.63177
## ENSG00000000419 13.05284 28.37234 27.76082
## ENSG00000000457 185.46142 152.97943 113.97631
## ENSG00000000460 53.91895 51.50589 47.99208
## ENSG00000000938 44.99193 32.42752 57.72998
## ENSG00000000971 153.93871 110.24908 123.04170
## L_5342_OL_RNA_S95 L_5425_OL_RNA_S39 L_5492_OL_RNA_S78
## ENSG00000000003 43.932208 13.21789 24.11123
## ENSG00000000419 27.508268 12.08312 17.15611
## ENSG00000000457 112.441081 46.99159 103.12127
## ENSG00000000460 45.338682 37.11980 30.61393
## ENSG00000000938 7.129156 38.70423 25.22177
## ENSG00000000971 33.344420 52.37255 43.59417
## L_5566_OL_RNA_S92 L_5569_OL_RNA_S9 L_5594_OL_RNA_S65
## ENSG00000000003 42.24082 68.04576 69.49676
## ENSG00000000419 7.50626 0.00000 0.00000
## ENSG00000000457 60.79745 141.47573 92.16050
## ENSG00000000460 46.27109 53.75986 54.60007
## ENSG00000000938 29.92210 39.33452 53.73396
## ENSG00000000971 60.26747 182.64778 490.23938
## L_5017_OL_RNA_S87 L_5147_OL_RNA_S22 L_5197_OL_RNA_S20
## ENSG00000000003 42.984646 69.34930 33.73442
## ENSG00000000419 3.910817 6.56959 10.82245
## ENSG00000000457 80.805508 78.06097 103.28810
## ENSG00000000460 39.482685 56.00870 42.18951
## ENSG00000000938 46.092878 113.67277 49.17866
## ENSG00000000971 64.029192 517.03545 165.37552
## L_5252_OL_RNA_S19 L_5275_OL_RNA_S57 E_4934_OL_RNA_S40
## ENSG00000000003 32.38435 89.01499 18.057221
## ENSG00000000419 12.32396 72.18759 5.649749
## ENSG00000000457 77.45872 126.65909 22.995939
## ENSG00000000460 42.04894 80.52346 16.513741
## ENSG00000000938 33.92262 59.54354 16.998740
## ENSG00000000971 57.24617 204.16033 57.242921
## E_5047_OL_RNA_S43 E_5082_OL_RNA_S80 E_5107_OL_RNA_S98
## ENSG00000000003 5.983025 29.940971 25.84285907
## ENSG00000000419 0.000000 1.598278 0.01694824
## ENSG00000000457 6.046037 21.482075 9.53858873
## ENSG00000000460 1.496420 11.022296 4.65642648
## ENSG00000000938 31.442038 54.809737 46.04421144
## ENSG00000000971 28.095790 36.655915 15.86122752
## E_5333_OL_RNA_S25 E_5337_OL_RNA_S12 E_5419_OL_RNA_S14
## ENSG00000000003 37.83331 24.491256 25.455549
## ENSG00000000419 2.87569 1.186218 0.000000
## ENSG00000000457 42.46450 29.593666 7.574574
## ENSG00000000460 75.04347 22.767128 4.998618
## ENSG00000000938 80.28758 64.598344 15.084744
## ENSG00000000971 65.91267 108.837765 17.410583
## E_5454_OL_RNA_S21 E_5537_OL_RNA_S76 E_5585_OL_RNA_S6
## ENSG00000000003 30.94204 37.875627 57.640603
## ENSG00000000419 10.98872 9.031095 5.696223
## ENSG00000000457 72.37977 58.859506 35.689411
## ENSG00000000460 42.69146 26.252744 27.578701
## ENSG00000000938 94.16010 84.239635 83.229502
## ENSG00000000971 120.31361 97.306828 207.095468
## E_5020_OL_RNA_S79 E_4988_OL_RNA_S16 E_5113_OL_RNA_S23
## ENSG00000000003 36.82717 74.14136 45.952496
## ENSG00000000419 15.58039 11.53970 6.029349
## ENSG00000000457 46.25728 54.53149 72.955328
## ENSG00000000460 23.18493 26.59712 58.213898
## ENSG00000000938 109.96816 119.96690 95.993383
## ENSG00000000971 56.91853 219.50860 358.232096
## L_4950_OL_RNA_S64 L_5303_OL_RNA_S24 L_5366_OL_RNA_S5
## ENSG00000000003 20.566745 15.920322 19.41930
## ENSG00000000419 2.775672 8.057642 6.89522
## ENSG00000000457 57.421866 25.931982 31.65121
## ENSG00000000460 18.465445 13.176996 20.82778
## ENSG00000000938 36.424041 66.121533 41.98890
## ENSG00000000971 104.159476 62.262032 71.94089
## L_5376_OL_RNA_S61 L_5526_OL_RNA_S100 L_4964_OL_RNA_S32
## ENSG00000000003 14.937899 49.279873 2.567742e+01
## ENSG00000000419 3.322799 4.965268 9.091280e-08
## ENSG00000000457 21.781072 60.955968 4.192650e+01
## ENSG00000000460 11.784071 28.554476 3.467784e+01
## ENSG00000000938 45.589496 93.679074 9.661300e+01
## ENSG00000000971 41.879787 240.504291 7.947590e+01
## L_5065_OL_RNA_S37 L_5269_OL_RNA_S30 E_5184_OL_RNA_S46
## ENSG00000000003 85.24805 108.00403 41.046963
## ENSG00000000419 21.02972 0.00000 6.910645
## ENSG00000000457 61.41075 46.76018 48.171061
## ENSG00000000460 55.30758 35.88983 26.526717
## ENSG00000000938 122.93885 100.62998 133.947474
## ENSG00000000971 179.68781 270.17228 214.158209
## E_5240_OL_RNA_S73 E_5000_OL_RNA_S58 L_5394_OL_RNA_S4
## ENSG00000000003 75.89399 60.12211 54.094489
## ENSG00000000419 0.00000 4.95507 3.224707
## ENSG00000000457 44.71548 38.46552 19.841089
## ENSG00000000460 10.66665 38.88151 18.898575
## ENSG00000000938 98.34171 125.86505 148.393369
## ENSG00000000971 280.35927 801.86969 499.991450
## C_5628_OL_RNA_S44 C_5114_OL_RNA_S69 C_4942_OL_RNA_S90
## ENSG00000000003 23.76476 41.07581 28.04265
## ENSG00000000419 10.19532 11.45113 19.52941
## ENSG00000000457 94.32358 113.85271 71.97302
## ENSG00000000460 38.22775 40.30357 23.39992
## ENSG00000000938 17.07324 52.20809 25.28685
## ENSG00000000971 71.01627 188.12193 33.75536
## C_4954_OL_RNA_S28 C_5709_OL_RNA_S18 C_5070_OL_RNA_S60
## ENSG00000000003 52.16995 64.97861 62.994313
## ENSG00000000419 21.18185 25.99144 1.628206
## ENSG00000000457 74.81350 110.67929 104.467279
## ENSG00000000460 71.26355 49.29619 42.416638
## ENSG00000000938 86.49208 39.75260 99.771111
## ENSG00000000971 301.15890 494.08391 68.041632
## C_4870_OL_RNA_S68 C_5248_OL_RNA_S82
## ENSG00000000003 103.671885 69.463193
## ENSG00000000419 4.207163 5.521627
## ENSG00000000457 151.904752 174.692519
## ENSG00000000460 67.153599 55.376823
## ENSG00000000938 52.099429 36.343606
## ENSG00000000971 95.683895 207.515722
##
## Metadata (first few rows):
## PATHID PMHOURS PATHDX SEX AGE onsetAge APOE BRAAK1
## E_4920_OL_RNA_S1 4920 NA Alzheimers M 67 60 34 6
## E_5273_OL_RNA_S75 5273 10 Alzheimers M 41 35 33 6
## L_5342_OL_RNA_S95 5342 10 Alzheimers F 83 76 33 6
## L_5425_OL_RNA_S39 5425 6 Alzheimers F 80 73 34 6
## L_5492_OL_RNA_S78 5492 6 Alzheimers M 79 72 33 6
## L_5566_OL_RNA_S92 5566 6 Alzheimers M 84 72 33 6
## BRAAK.combo MFTANGLES IPTANGLES STTANGLES HPTANGLES MFPLAQUES
## E_4920_OL_RNA_S1 6.3 7 6 12 10 50
## E_5273_OL_RNA_S75 6.2 19 19 7 22 43
## L_5342_OL_RNA_S95 6.2 4 4 6 17 24
## L_5425_OL_RNA_S39 6.2 5 8 8 17 50
## L_5492_OL_RNA_S78 6.2 5 4 4 18 48
## L_5566_OL_RNA_S92 6.2 4 5 26 26 41
## IPPLAQUES STPLAQUES HPPLAQUES SUM_TANGLES EDUCATION BLESSED
## E_4920_OL_RNA_S1 50 50 25 35 14 33
## E_5273_OL_RNA_S75 50 44 46 67 16 6
## L_5342_OL_RNA_S95 21 35 8 31 20 16
## L_5425_OL_RNA_S39 50 34 16 38 16 30
## L_5492_OL_RNA_S78 43 35 13 31 12 16
## L_5566_OL_RNA_S92 43 -4 9 61 12 21
## DURATION MMSE DRS Sample MappingRate RIN ALZHEIMER_TYPE
## E_4920_OL_RNA_S1 8 0 60 EOL1 0.406 2.9 Early
## E_5273_OL_RNA_S75 6 NA 81 EOL12 0.408 3.6 Early
## L_5342_OL_RNA_S95 5 18 88 LOL11 0.395 2.1 Late
## L_5425_OL_RNA_S39 8 0 58 LOL15 0.325 2.5 Late
## L_5492_OL_RNA_S78 7 21 114 LOL16 0.333 3.3 Late
## L_5566_OL_RNA_S92 12 16 91 LOL19 0.370 3.1 Late
##
## Counts per Sample in PATHDX group (Healthy vs Alzheimer):
##
## Alzheimers Healthy
## 39 8
# Se a variável de early vs late estiver presente, por exemplo `ALZHEIMER_TYPE`
# Adicione essa análise também:
if ("Sample" %in% colnames(metadata)) {
cat("\nSample Count per Alzheimer onset (Healthy, Early, Late):\n")
print(table(metadata$ALZHEIMER_TYPE))
} else {
cat("\nColumn 'ALzheimer Type' not found in metadata.\n")
}##
## Sample Count per Alzheimer onset (Healthy, Early, Late):
##
## Control Early Late
## 8 19 20
ggplot(metadata, aes(x = ALZHEIMER_TYPE, fill = ALZHEIMER_TYPE)) +
geom_bar() +
theme_minimal() +
labs(title = "Distribuition of samples per group", x = "Group", y = "No of samples") +
theme(legend.position = "none")# New matrix without interception
design <- model.matrix(~ 0 + ALZHEIMER_TYPE, data = metadata)
colnames(design) <- levels(metadata$ALZHEIMER_TYPE)
cat("\nDesign matrix for comparisson between Control, Early, Late:\n")##
## Design matrix for comparisson between Control, Early, Late:
## Control Early Late
## E_4920_OL_RNA_S1 0 1 0
## E_5273_OL_RNA_S75 0 1 0
## E_5618_OL_RNA_S62 0 1 0
## L_5342_OL_RNA_S95 0 0 1
## L_5425_OL_RNA_S39 0 0 1
## L_5492_OL_RNA_S78 0 0 1
# Define contrasts for desired comparisons
contrast.matrix <- makeContrasts(
Early_vs_Control = Early - Control,
Late_vs_Control = Late - Control,
Early_vs_Late = Early - Late,
levels = design
)##
## Design matrix (first few rows):
## Control Early Late
## E_4920_OL_RNA_S1 0 1 0
## E_5273_OL_RNA_S75 0 1 0
## E_5618_OL_RNA_S62 0 1 0
## L_5342_OL_RNA_S95 0 0 1
## L_5425_OL_RNA_S39 0 0 1
## L_5492_OL_RNA_S78 0 0 1
# Fitting model and apply contrast<a
fit <- lmFit(v, design)
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)
# Extrair resultados de cada contraste
res_early_vs_ctrl <- topTable(fit2, coef = "Early_vs_Control", number = Inf)
res_late_vs_ctrl <- topTable(fit2, coef = "Late_vs_Control", number = Inf)
res_early_vs_late <- topTable(fit2, coef = "Early_vs_Late", number = Inf)
# Adicionar IDs
res_early_vs_ctrl$gene_id <- rownames(res_early_vs_ctrl)
res_late_vs_ctrl$gene_id <- rownames(res_late_vs_ctrl)
res_early_vs_late$gene_id <- rownames(res_early_vs_late)
# Salvar resultados
write.csv(res_early_vs_ctrl, "DGE_Early_vs_Control.csv", row.names = FALSE)
write.csv(res_late_vs_ctrl, "DGE_Late_vs_Control.csv", row.names = FALSE)
write.csv(res_early_vs_late, "DGE_Early_vs_Late.csv", row.names = FALSE)
# Contar genes significativos (ajuste FDR < 0.05 e |logFC| > 1)
cat("\nSummary of differentially expressed genes:\n")##
## Summary of differentially expressed genes:
cat("Early vs Control:", sum(res_early_vs_ctrl$adj.P.Val < 0.05 & abs(res_early_vs_ctrl$logFC) > 1), "genes\n")## Early vs Control: 400 genes
cat("Late vs Control:", sum(res_late_vs_ctrl$adj.P.Val < 0.05 & abs(res_late_vs_ctrl$logFC) > 1), "genes\n")## Late vs Control: 0 genes
cat("Early vs Late:", sum(res_early_vs_late$adj.P.Val < 0.05 & abs(res_early_vs_late$logFC) > 1), "genes\n")## Early vs Late: 0 genes
The voom mean-variance trend plot demonstrates that the relationship between gene expression level and variance has been successfully modeled. The red trend line shows a smooth decrease in variance as the mean expression increases, which is expected in RNA-seq data and confirms that lowly expressed genes tend to exhibit higher variability. The stabilization of variance at higher expression levels indicates that the data are well-behaved and suitable for downstream linear modeling with limma. Overall, this plot validates the voom transformation and supports the reliability of the differential expression results.
results_limma <- topTable(fit, coef = 2, number = Inf, sort.by = "none")
results_limma$gene_id <- rownames(results_limma)
final_output <- results_limma[, c("gene_id", "logFC", "adj.P.Val")]
write.csv(final_output, file = "DGE_results_limma_voom.csv", row.names = FALSE)##
## Differential Expression Analysis Complete. Results saved as 'DGE_results_limma_voom.csv'.
##
## ------ Metadata Analysis ------
##
## Summary of Metadata:
## PATHID PMHOURS PATHDX SEX
## Min. :4870 Min. : 2.0 Length:47 Length:47
## 1st Qu.:5056 1st Qu.: 6.0 Class :character Class :character
## Median :5252 Median : 8.0 Mode :character Mode :character
## Mean :5251 Mean : 9.5
## 3rd Qu.:5422 3rd Qu.:12.0
## Max. :5709 Max. :39.0
## NA's :9
## AGE onsetAge APOE BRAAK1
## Min. :41.00 Min. :35.00 Min. :23.00 Min. :0.000
## 1st Qu.:67.25 1st Qu.:55.50 1st Qu.:33.00 1st Qu.:6.000
## Median :79.00 Median :71.00 Median :33.00 Median :6.000
## Mean :74.20 Mean :62.74 Mean :33.18 Mean :5.128
## 3rd Qu.:83.00 3rd Qu.:73.00 3rd Qu.:34.00 3rd Qu.:6.000
## Max. :97.00 Max. :78.00 Max. :34.00 Max. :6.000
## NA's :1 NA's :8 NA's :2
## BRAAK.combo MFTANGLES IPTANGLES STTANGLES
## Min. :6.200 Min. : 2.000 Min. : 3.000 Min. : 2.00
## 1st Qu.:6.200 1st Qu.: 4.000 1st Qu.: 5.500 1st Qu.: 5.50
## Median :6.200 Median : 6.000 Median : 7.000 Median :10.00
## Mean :6.218 Mean : 7.385 Mean : 8.179 Mean :11.31
## 3rd Qu.:6.200 3rd Qu.: 8.000 3rd Qu.:10.500 3rd Qu.:12.00
## Max. :6.300 Max. :24.000 Max. :19.000 Max. :66.00
## NA's :8 NA's :8 NA's :8 NA's :8
## HPTANGLES MFPLAQUES IPPLAQUES STPLAQUES
## Min. : 3.00 Min. :24.00 Min. :-4.00 Min. :-4.00
## 1st Qu.:13.50 1st Qu.:49.50 1st Qu.:46.00 1st Qu.:35.00
## Median :22.00 Median :50.00 Median :50.00 Median :47.00
## Mean :22.59 Mean :47.56 Mean :44.46 Mean :41.77
## 3rd Qu.:27.50 3rd Qu.:50.00 3rd Qu.:50.00 3rd Qu.:50.00
## Max. :66.00 Max. :50.00 Max. :50.00 Max. :50.00
## NA's :8 NA's :8 NA's :8 NA's :8
## HPPLAQUES SUM_TANGLES EDUCATION BLESSED DURATION
## Min. : 6.00 Min. : 19.00 Min. :10.00 Min. : 4.00 Min. : 5
## 1st Qu.:14.00 1st Qu.: 31.00 1st Qu.:12.00 1st Qu.:11.00 1st Qu.: 8
## Median :18.00 Median : 39.00 Median :14.00 Median :30.00 Median :11
## Mean :22.05 Mean : 49.46 Mean :14.36 Mean :23.97 Mean :10
## 3rd Qu.:25.50 3rd Qu.: 60.50 3rd Qu.:16.00 3rd Qu.:33.00 3rd Qu.:12
## Max. :50.00 Max. :147.00 Max. :20.00 Max. :85.00 Max. :15
## NA's :8 NA's :8 NA's :11 NA's :8 NA's :11
## MMSE DRS Sample MappingRate
## Min. : 0.000 Min. : 5.00 Length:47 Min. :0.1150
## 1st Qu.: 0.000 1st Qu.: 33.50 Class :character 1st Qu.:0.3295
## Median : 2.000 Median : 43.00 Mode :character Median :0.3750
## Mean : 7.032 Mean : 57.97 Mean :0.3608
## 3rd Qu.:16.000 3rd Qu.: 89.00 3rd Qu.:0.4005
## Max. :28.000 Max. :121.00 Max. :0.5020
## NA's :16 NA's :8
## RIN ALZHEIMER_TYPE
## Min. :1.00 Control: 8
## 1st Qu.:1.75 Early :19
## Median :2.20 Late :20
## Mean :2.24
## 3rd Qu.:2.65
## Max. :3.60
##
The metadata analysis reveals a cohort of 47 samples with a balanced distribution across Alzheimer’s disease stages: 8 control, 19 early-stage, and 20 late-stage cases. The average age was 74.2 years, with disease onset typically occurring around 62.7 years. Neuropathological markers such as BRAAK stages and plaque/tangle scores show expected variability, with several missing values across key measures. Cognitive scores (MMSE, DRS) and biological quality metrics (RIN, MappingRate) also vary, reflecting a heterogeneous dataset suitable for downstream stratified analyses.
##
## Descriptive Statistics for AGE:
## Mean Age: 74.19565
## Median Age: 79
## Age Range: 41 - 97
##
## Descriptive Statistics for EDUCATION:
## Mean Education: 14.36111
## Median Education: 14
## Education Range: 10 - 20
##
## Metadata analysis complete.
Descriptive statistics indicate that participants had a mean age of 74.2 years (range: 41–97) and a median of 79, suggesting a slightly right-skewed age distribution. Education levels were relatively high, with a mean of 14.4 years and a range of 10 to 20 years, indicating a well-educated cohort. These demographic features provide important context for interpreting clinical and molecular findings in the dataset.
# Filter low count genes
keep <- filterByExpr(dge, design)
dge <- dge[keep, , keep.lib.sizes=FALSE]
# Normalize with TMM
dge <- calcNormFactors(dge)
# Estimate dispersion
dge <- estimateDisp(dge, design)
# Fit the model with GLM
fit <- glmFit(dge, design)
# Perform statistical test (Likelihood Ratio Test)
lrt <- glmLRT(fit)
# View top differentially expressed genes
topTags(lrt)## Coefficient: Late
## logFC logCPM LR PValue FDR
## ENSG00000180061 -20.61724 0.009724043 1760.337 0 0
## ENSG00000259964 -20.50310 0.083881772 1817.169 0 0
## ENSG00000261696 -20.45203 0.130828238 1898.934 0 0
## ENSG00000204118 -20.44666 0.181243181 3396.740 0 0
## ENSG00000230465 -20.40902 0.212726469 3583.873 0 0
## ENSG00000116194 -20.39605 0.261833078 2178.738 0 0
## ENSG00000234737 -20.38718 0.225428248 1553.236 0 0
## ENSG00000197813 -20.38375 0.108890513 2101.287 0 0
## ENSG00000287260 -20.38003 0.285945836 3863.954 0 0
## ENSG00000241345 -20.37694 0.479381725 1628.000 0 0
# Connect to Ensembl
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
# Convert Ensembl IDs
converted <- getBM(
attributes = c("ensembl_gene_id", "hgnc_symbol"),
filters = "ensembl_gene_id",
values = results$Gene,
mart = ensembl
)
# Merge with results
results_annot <- merge(results, converted, by.x = "Gene", by.y = "ensembl_gene_id")
# Filter significant genes (e.g., FDR < 0.05 and |log2FC| > 1)
results_annot$Significant <- ifelse(results_annot$FDR < 0.05 & abs(results_annot$logFC) > 1, "yes", "no")
# Select top genes for labeling
top_genes <- subset(results_annot, Significant == "yes")# Unir resultados corretos ao gene annotation
results_early_vs_ctrl <- topTable(fit2, coef = "Early_vs_Control", number = Inf)
results_early_vs_ctrl$Gene <- rownames(results_early_vs_ctrl)
# Anotar com biomaRt
library(biomaRt)
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
converted <- getBM(
attributes = c("ensembl_gene_id", "hgnc_symbol"),
filters = "ensembl_gene_id",
values = results_early_vs_ctrl$Gene,
mart = ensembl
)
results_annot <- merge(results_early_vs_ctrl, converted,
by.x = "Gene", by.y = "ensembl_gene_id")
# Marcar genes significativos
results_annot$Significant <- ifelse(
results_annot$adj.P.Val < 0.05 & abs(results_annot$logFC) > 1,
"yes", "no"
)
# Limitar valores extremos
results_annot$logFC <- pmax(pmin(results_annot$logFC, 5), -5)
results_annot$log10FDR <- -log10(pmax(results_annot$adj.P.Val, 1e-10))
# Selecionar top genes para label
top_genes <- results_annot %>%
dplyr::filter(Significant == "yes") %>%
dplyr::arrange(adj.P.Val) %>%
dplyr::slice_head(n = 15)
# Volcano plot
ggplot(results_annot, aes(x = logFC, y = log10FDR, color = Significant)) +
geom_point(alpha = 0.6) +
scale_color_manual(values = c("no" = "grey", "yes" = "coral")) +
geom_vline(xintercept = c(-1, 1), linetype = "dashed") +
geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
ggrepel::geom_text_repel(
data = top_genes,
aes(label = hgnc_symbol),
size = 3,
max.overlaps = 20,
box.padding = 0.5
) +
theme_minimal() +
labs(
title = "Volcano Plot: Early vs Control",
x = "Log2 Fold Change",
y = "-Log10 Adjusted P-Value"
)# Verifica se os genes estão no topo dos resultados
subset(results_annot, hgnc_symbol %in% c("EPHA5-AS1", "LDHAP4", "LIRIL2R" , "CAMK1G" ,"TMEM54", "FOXO1", "NELL1", "MDGA1", "OSGEP", "UBXN10", "CA11P", "SYTL2", "FOXQ1", "ABCA11P", "FLT3"))[, c("hgnc_symbol", "logFC", "adj.P.Val")]A subset of differentially expressed genes between Early Alzheimer and Control samples revealed both upregulated and downregulated transcripts. Notably, LIRIL2R, EPHA5-AS1, and ABCA11P showed strong downregulation in Early samples (logâ‚‚FC < -1.7, adj. p < 0.05), while FOXQ1 and TMEM54 were among the most upregulated. While FOXO1 appeared visually in the volcano plot, its adjusted p-value (0.109) indicates it did not pass statistical significance thresholds, highlighting the importance of careful result interpretation.
The density plot and boxplot of normalized gene expression values demonstrate successful normalization across samples. The density curves overlap substantially, indicating consistent global expression distributions and absence of large-scale technical bias. Similarly, the boxplot shows aligned medians and comparable interquartile ranges across samples, suggesting that expression values are well-centered and scaled. The presence of outliers is expected in RNA-seq data and reflects biological variability rather than technical artifacts. Together, these plots confirm the data is suitable for downstream differential expression analysis.
# Filtering for Early vs Control
metadata_filtered <- metadata[metadata$ALZHEIMER_TYPE %in% c("Control", "Early"), ]
# Making sure v$E are the filtered samples
v_filtered <- v$E[, rownames(metadata_filtered)]
# PCA
pca <- prcomp(t(v_filtered))
pca_df <- data.frame(pca$x[, 1:2], Group = metadata_filtered$ALZHEIMER_TYPE)
# Plot
ggplot(pca_df, aes(x = PC1, y = PC2, color = Group)) +
geom_point(size = 3) +
theme_minimal() +
labs(title = "PCA: Early vs Control", x = "PC1", y = "PC2")ggplot(results_annot, aes(x = logFC)) +
geom_histogram(bins = 60, fill = "skyblue", color = "black") +
theme_minimal() +
labs(title = "Distribution of Log2 Fold Change", x = "logFC")The distribution of log2 fold changes between Early Alzheimer and Control samples follows a near-normal pattern centered around zero, indicating that the majority of genes exhibit no substantial expression change between groups. This symmetry and concentration around zero are expected in transcriptome-wide analyses, with only a minority of genes showing biologically meaningful up- or downregulation. The tails of the distribution highlight differentially expressed genes and are consistent with the genes identified in the volcano plot.
We assessed potential outliers and batch effects using PCA and sample metadata variables (e.g., RIN, PMHOURS, MappingRate). This is essential to ensure that technical variation does not confound biological interpretation.
# Match expression data
v_filtered <- v$E[, rownames(metadata_filtered)]
# Perform PCA
pca <- prcomp(t(v_filtered), scale. = TRUE)
pca_df <- data.frame(pca$x[, 1:2],
RIN = metadata_filtered$RIN, PMHOURS = metadata_filtered$PMHOURS,
MappingRate = metadata_filtered$MappingRate)
# PCA colored by RIN
ggplot(pca_df, aes(x = PC1, y = PC2, color = RIN)) + geom_point(size = 3) + scale_color_gradient(low = "blue", high = "red") + theme_minimal() + labs(title = "PCA Colored by RIN", x = "PC1", y = "PC2")# PCA colored by PMHOURS
ggplot(pca_df, aes(x = PC1, y = PC2, color = PMHOURS)) + geom_point(size = 3) + scale_color_gradient(low = "blue", high = "red") + theme_minimal() + labs(title = "PCA Colored by PMHOURS", x = "PC1", y = "PC2")# PCA colored by Mapping Rate
ggplot(pca_df, aes(x = PC1, y = PC2, color = MappingRate)) + geom_point(size = 3) + scale_color_gradient(low = "blue", high = "red") + theme_minimal() + labs(title = "PCA Colored by Mapping Rate", x = "PC1", y = "PC2")# Distance from origin in PCA space
dists <- sqrt(rowSums(pca$x[, 1:2]^2))
threshold <- mean(dists) + 3 * sd(dists)
metadata_filtered$Outlier <- dists > threshold
# Visualize outliers
pca_df$Outlier <- metadata_filtered$Outlier
ggplot(pca_df, aes(x = PC1, y = PC2, color = Outlier)) +
geom_point(size = 3) +
scale_color_manual(values = c("FALSE" = "black", "TRUE" = "red")) +
theme_minimal() +
labs(title = "PCA: Outlier Detection", x = "PC1", y = "PC2")# Remove effect of RIN, if it's a confounder
library(limma)
v_corrected <- removeBatchEffect(v_filtered, covariates = metadata_filtered$RIN)
# PCA after correction
pca_corr <- prcomp(t(v_corrected), scale. = TRUE)
pca_corr_df <- data.frame(pca_corr$x[, 1:2], Group = metadata_filtered$ALZHEIMER_TYPE)
ggplot(pca_corr_df, aes(x = PC1, y = PC2, color = Group)) +
geom_point(size = 3) +
theme_minimal() +
labs(title = "PCA After Batch Effect Correction", x = "PC1", y = "PC2")Batch effect correction was performed using the
limma::removeBatchEffect() function, accounting for RIN and
MappingRate as covariates. PCA plots after correction show reduced
technical bias, with improved separation of biological groups (Early vs
Control), suggesting that the correction was effective.